Adapted from Analyzing Baseball Data with R, by Marchi and Albert (CRC Press)
In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
DATA_DIR = '../data/lahman-csv_2015-01-24/'
In [2]:
teams = pd.read_csv(DATA_DIR+'Teams.csv')
teams.head()
Out[2]:
In [178]:
since_2001 = teams.loc[teams.yearID > 2000, ['teamID', 'yearID', 'lgID', 'G', 'W', 'L', 'R', 'RA']]
since_2001.head()
Out[178]:
In [179]:
since_2001 = since_2001.assign(RD=since_2001.R - since_2001.RA, Wpct=since_2001.W / (since_2001.W + since_2001.L))
In [180]:
axes = sns.regplot(since_2001.RD, since_2001.Wpct)
axes.set_xlabel('Run differential')
axes.set_ylabel('Winning percentage')
Out[180]:
Bayesian regression using PyMC3:
In [244]:
import pymc3 as pm
with pm.Model() as run_model:
pm.glm.glm('Wpct ~ RD', since_2001)
trace = pm.sample(1000)
pm.traceplot(trace, vars=['RD'])
pm.summary(trace)
In [245]:
with pm.Model() as run_model_resid:
β = pm.Normal('β', 0, 1e-6, shape=2)
μ = β[0] + β[1] * since_2001.RD
σ = pm.Uniform('σ', 0, 100)
wp = pm.Normal('wp', μ, σ**-2, observed=since_2001.Wpct)
ϵ = pm.Deterministic('ϵ', since_2001.Wpct.values - μ)
trace = pm.sample(2000)
In [246]:
pm.summary(trace[1000:], vars=['β'], roundto=6)
In [247]:
pm.forestplot(trace[1000:], vars=['ϵ'], ylabels=['']*len(since_2001))
Out[247]:
In [248]:
axes = sns.regplot(since_2001.RD, trace['ϵ'][1000:].mean(axis=0))
axes.set_ylabel('Residual')
axes.set_xlabel('Run differential')
Out[248]:
Using scikit-learn:
In [182]:
from sklearn import linear_model
Wpct = since_2001.Wpct.values
RD = since_2001.RD.reshape(-1,1)
regr = linear_model.LinearRegression()
regr.fit(RD, Wpct, since_2001.Wpct)
Out[182]:
In [183]:
regr.coef_
Out[183]:
In [184]:
Wpct_hat = regr.predict(RD)
ϵ = Wpct - Wpct_hat
In [185]:
axes = sns.regplot(since_2001.RD, ϵ)
axes.set_ylabel('Residual')
axes.set_xlabel('Run differential')
Out[185]:
In [186]:
regr.score(RD, Wpct)
Out[186]:
Root mean squared error (RMSE)
In [187]:
rmse = np.sqrt((ϵ**2).mean())
In [188]:
(np.abs(ϵ) < rmse).mean()
Out[188]:
In [189]:
(np.abs(ϵ) < 2*rmse).mean()
Out[189]:
Pythagorean wins for SF
In [243]:
R, RA = since_2001[['R', 'RA']].T.values
since_2001 = since_2001.assign(pythWins=162 * R**2 / (RA**2 + R**2))
In [191]:
since_2001[(since_2001.teamID=='SFN') & (since_2001.yearID==2011)]
Out[191]:
Load 2011 game logs
In [192]:
games_2011 = pd.read_csv('../data/gl2011.txt', header=None)
games_2011.columns = pd.read_csv('../data/game_log_header.txt', header=None).values[0]
games_2011.head()
Out[192]:
In [193]:
SF2011 = games_2011.loc[(games_2011.HomeTeam=='SFN') | (games_2011.VisitingTeam=='SFN'),
("VisitingTeam", "HomeTeam", "VisitorRunsScored", "HomeRunsScore")]
SF2011
Out[193]:
Calculate SF score differences
In [194]:
SF2011 = SF2011.assign(ScoreDiff=np.where(SF2011.HomeTeam=='SFN', SF2011.HomeRunsScore - SF2011.VisitorRunsScored,
SF2011.VisitorRunsScored - SF2011.HomeRunsScore))
SF2011 = SF2011.assign(Outcome=(SF2011.ScoreDiff > 0).replace({True:'win', False:'loss'}))
In [195]:
SF2011.groupby('Outcome').apply(lambda x: x.ScoreDiff.describe())
Out[195]:
In [196]:
SF2011.hist(column='ScoreDiff', by='Outcome')
Out[196]:
Losses were decided by larger margin than wins, leading to overperformance versus Pythagorean formula.
In [253]:
with pm.Model() as pythag_model:
exp = pm.Uniform('exp', 0, 5)
pyth_wins = pm.Deterministic('pyth_wins', 162 * R**exp / (RA**exp + R**exp))
wins = pm.Poisson('wins', pyth_wins, observed=since_2001.W.values)
trace = pm.sample(2000)
In [254]:
pm.traceplot(trace, vars=['exp'])
Out[254]:
In [255]:
pm.summary(trace, vars=['exp'])
In [203]:
results = games_2011[['VisitingTeam', 'HomeTeam', 'VisitorRunsScored', 'HomeRunsScore']]
In [204]:
results = results.assign(Winner=results.HomeTeam.where(results.HomeRunsScore > results.VisitorRunsScored,
results.VisitingTeam),
Difference=np.abs(results.VisitorRunsScored - results.HomeRunsScore))
In [205]:
results.head()
Out[205]:
In [206]:
one_run_games = results.query('Difference==1').Winner.value_counts()
one_run_games.name = 'OneRunGames'
Merge with residuals table
In [207]:
one_run_games.head()
Out[207]:
In [208]:
teams_2011 = since_2001.query('yearID==2011').set_index('teamID').join(one_run_games)
In [211]:
teams_2011 = teams_2011.assign(pythResid=teams_2011.W - teams_2011.pythWins)
In [212]:
sns.regplot(teams_2011.OneRunGames, teams_2011.pythResid, 'ro')
Out[212]:
Relationship between one-run wins and relief pitching
In [213]:
pitching = pd.read_csv('../data/lahman-csv_2015-01-24/Pitching.csv')
pitching.head()
Out[213]:
In [216]:
top_closers = pitching.query('GF > 50 & ERA < 2.5')[["playerID", "yearID", "teamID"]]
In [223]:
top_closers[top_closers.yearID==2011].set_index('teamID')
Out[223]:
In [234]:
teams_2011.head()
Out[234]:
In [241]:
top_closers_merged = top_closers[top_closers.yearID==2011].set_index('teamID').join(teams_2011, lsuffix='_a', rsuffix='_b')
In [242]:
top_closers_merged.pythResid.describe()
Out[242]:
In [ ]: